Load script and directories

# Source shared helpers from the top-level project folder
source("../process_plate_run.R")

#— 1. Resolve output locations from params, falling back to defaults
plot_dir   <- params$plot_dir   %||% "output_prot/plots"
data_dir   <- params$data_dir   %||% "output_prot/export_data"
report_dir <- params$report_dir %||% "output_prot/reports"

#— 2. Make sure every output directory exists
for (d in c(plot_dir, data_dir, report_dir)) {
  if (!dir_exists(d)) {
    dir_create(d, recurse = TRUE)
  }
}

#— 3. Point knitr's figure output at plot_dir
opts_chunk$set(fig.path = paste0(plot_dir, "/"))
#

Load input data

# 1. Folder holding every Excel input workbook
input_folder <- "Input_prot"

# 2. Collect full paths of all .xls/.xlsx files, then drop Excel
#    lock/temp files (their basenames start with "~$")
excel_files <- list.files(
  path = input_folder,
  pattern = "\\.xlsx?$",
  full.names = TRUE
)
is_temp     <- grepl("^~\\$", basename(excel_files))
excel_files <- excel_files[!is_temp]

# 3. Read each workbook into its own data frame in the global environment
for (path in excel_files) {
  # Object name = file name stripped of directory and extension
  obj_name <- tools::file_path_sans_ext(basename(path))

  # Prefer the second sheet when the workbook has one, else the first
  sheets       <- excel_sheets(path)
  target_sheet <- if (length(sheets) >= 2) sheets[2] else sheets[1]

  # Read quietly (suppress readxl's verbose messages)
  contents <- suppressMessages(read_excel(path, sheet = target_sheet))

  # Publish the data frame under obj_name for the rest of the report
  assign(obj_name, contents, envir = .GlobalEnv)

  # Confirm successful load
  message("Loaded: ", obj_name, " (sheet = '", target_sheet, "')")
}

Tidy spectrophotometer input data into data frames and correct blanks

# 1. Wells used as blanks in each plate run
blanks1 <- sprintf("A%02d", 1:3)    # "A01" "A02" "A03"
blanks2 <- sprintf("H%02d", 11:12)  # "H11" "H12"

# 2. Pair each raw dataset with its blank wells (kept in matching order,
#    since the blank list is matched to the datasets by position)
listprot   <- list(pr1 = pr1, pr2 = pr2)
listBlanks <- list(blanks1, blanks2)

# 3. tidy_all() applies tidy_and_correct() to every dataset/blank pair
tidy_all(listprot, listBlanks)

STEP 5: JOIN THE DATA WITH ITS SAMPLE WEIGHTS SPREADSHEET

Description of joindf_by_id Function

The joindf_by_id function takes two data frames (df1 and df2) and merges them by matching unique identifiers related to samples, specifically using either a Cell_ID column or a plate well column.
Key steps and features:

- Column Cleaning: Trims whitespace from column names in both data frames to avoid join errors caused by accidental spaces.
- Key Column Verification: Checks that at least one data frame contains a Cell_ID column and the other contains a plate well column — these serve as the join keys.
- Role Assignment: Depending on which data frame contains Cell_ID, that data frame is assigned as the base (df_cell), and the other becomes the joining data (df_plate).
- Rename Join Keys: Renames both join columns to a common key name (join_id) to facilitate a straightforward left join.
- Perform Join: Conducts a left join, keeping all rows from the base data frame and adding matching data from the other.
- Identify Unmatched Rows: Any rows in the larger data frame without matches are saved separately for troubleshooting.
- Output Files:
  - Saves the merged data frame as a CSV named according to the provided output_name.
  - Writes unmatched rows into a separate CSV file.
- Global Environment Assignment: Assigns the merged data frame into the global R environment under the same name as the output file (minus the .csv extension).
- Reporting: Prints messages listing any unmatched identifiers and returns a summary report containing counts of matched/unmatched rows and file paths of saved CSVs.

# 1. Subdirectory for the joined weight tables, nested under data_dir
joined_weights_dir <- file.path(data_dir, "joined_weights_prot")
if (!dir.exists(joined_weights_dir)) dir.create(joined_weights_dir, recursive = TRUE)

# 2. Weight sheets and tidied plate data, kept in matching order so they
#    can be zipped together element-wise
list_weights <- list(pr1_weights, pr2_weights)

pr_list <- list(
  pr1 = pr1_tidy,
  pr2 = pr2_tidy
)

# 3. Dataset names drive both the assigned object names and CSV file names
df_names <- names(pr_list)

# 3. Join each tidied plate dataset to its weight sheet and save the CSV.
#    Map() is mapply(..., SIMPLIFY = FALSE): one joindf_by_id() call per
#    dataset, returning the per-dataset summary reports as a list.
Map(
  function(df1, df2, name) {
    assign_name <- paste0(name, "_weights_joined")
    csv_path    <- file.path(joined_weights_dir, paste0(assign_name, ".csv"))

    joindf_by_id(
      df1           = df1,
      df2           = df2,
      key_df1       = "Cell_ID",
      key_df2       = "plate well",
      assign_name   = assign_name,
      save_csv_path = csv_path
    )
  },
  df1  = pr_list,
  df2  = list_weights,
  name = df_names
)
## $assigned_to
## [1] "pr1_weights_joined"
## 
## $rows_total
## [1] 96
## 
## $columns_total
## [1] 8
## 
## $unmatched_rows
## [1] 0
## $assigned_to
## [1] "pr2_weights_joined"
## 
## $rows_total
## [1] 96
## 
## $columns_total
## [1] 7
## 
## $unmatched_rows
## [1] 0
## $pr1
## # A tibble: 96 × 8
##    join_id      X600     X650     X750 ID      weights `Tray cell`
##    <chr>       <dbl>    <dbl>    <dbl> <chr>     <dbl> <chr>      
##  1 A01     -0.00667  -0.00733 -0.00833 Blank01     0   A1         
##  2 A02     -0.000667 -0.00233 -0.00433 Blank02     0   A2         
##  3 A03      0.00733   0.00967  0.0127  Blank03     0   A3         
##  4 A04      0.136     0.156    0.187   Lab_01     10.7 A7         
##  5 A05      0.116     0.133    0.160   Lab_01      9.6 A8         
##  6 A06      0.157     0.180    0.216   Lab_01     10.6 A9         
##  7 A07      0.0433    0.0527   0.0687  Lab_02     11   A10        
##  8 A08      0.0833    0.0917   0.106   Lab_02     10.3 A11        
##  9 A09      0.0553    0.0667   0.0777  Lab_02     11   A12        
## 10 A10      0.0433    0.0507   0.0617  Lab_03     11   A13        
## # ℹ 86 more rows
## # ℹ 1 more variable: date <dttm>
## 
## $pr2
## # A tibble: 96 × 7
##    join_id  X600 ID     weights `sample weight` `Tray well` date               
##    <chr>   <dbl> <chr>    <dbl>           <dbl> <chr>       <dttm>             
##  1 A01     0.289 Lab_01    26.6               1 a1          2025-05-30 00:00:00
##  2 A02     0.196 Lab_01    27                 2 a2          2025-05-30 00:00:00
##  3 A03     0.165 Lab_02    24.1               3 a3          2025-05-30 00:00:00
##  4 A04     0.234 Lab_02    25.9               4 a4          2025-05-30 00:00:00
##  5 A05     0.332 Lab_03    25                 5 a5          2025-05-30 00:00:00
##  6 A06     0.316 Lab_03    26                 6 a6          2025-05-30 00:00:00
##  7 A07     0.194 Lab_04    25.3               7 a7          2025-05-30 00:00:00
##  8 A08     0.132 Lab_04    23.9               8 a8          2025-05-30 00:00:00
##  9 A09     0.240 Lab_05    23.8               9 a9          2025-05-30 00:00:00
## 10 A10     0.262 Lab_05    25.7              10 a10         2025-05-30 00:00:00
## # ℹ 86 more rows

Create Standard Curve from Lowry Assay

# Fit the standard-curve regression: blank-corrected absorbance at 600 nm
# as a linear function of known protein concentration.
# (Swap in read.csv("your_data.csv") if the calibration data live in a file.)
model <- lm(X600_cor ~ concentration, data = calibration_prot)

# Print the fit summary (slope, intercept, R-squared) for the report
summary(model)
## 
## Call:
## lm(formula = X600_cor ~ concentration, data = calibration_prot)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.06295 -0.02037 -0.01329  0.02079  0.06805 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   0.013806   0.021665   0.637    0.538    
## concentration 0.155266   0.007345  21.139 1.25e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.03983 on 10 degrees of freedom
## Multiple R-squared:  0.9781, Adjusted R-squared:  0.9759 
## F-statistic: 446.8 on 1 and 10 DF,  p-value: 1.248e-09
# Annotation text: fitted equation plus R-squared
slope_txt     <- round(coef(model)[2], 4)
intercept_txt <- round(coef(model)[1], 4)
curve_label   <- paste0("y = ", slope_txt, "x + ", intercept_txt,
                        "\nR² = ", round(summary(model)$r.squared, 4))

# Standard-curve scatter plot with the fitted regression line
prot_curve_plot <- ggplot(calibration_prot, aes(x = concentration, y = X600_cor)) +
  geom_point(size = 3) +
  geom_smooth(method = "lm", se = FALSE, color = "blue") +
  labs(
    title = "Protein Standard Curve (Lowry Method)",
    x = "Protein Concentration (mg/mL)",
    y = "Absorbance (600 nm)"
  ) +
  annotate(
    "text",
    x     = max(calibration_prot$concentration) * 0.5,
    y     = max(calibration_prot$X600_cor) * 0.9,
    label = curve_label,
    size  = 4,
    hjust = 0
  ) +
  theme_minimal()

# Persist the figure via save_object(), then render it in the report
save_object(prot_curve_plot,
            filename  = "protein_standard_curve",
            directory = "plots",
            width     = 6,
            height    = 5,
            dpi       = 300)
print(prot_curve_plot)

## Calculate protein concentration with standard curve regression

# Use the regression model to predict concentration from absorbance.
# NOTE(review): export_dir / plot_dir duplicate the defaults resolved from
# params in the setup chunk; if params$data_dir or params$plot_dir ever
# change, these hard-coded paths will diverge — consider reusing data_dir.
export_dir <- "output_prot/export_data"
plot_dir   <- file.path("output_prot", "plots")
for (d in c(export_dir, plot_dir)) {
  dir.create(d, recursive = TRUE, showWarnings = FALSE)
}

# ---- Step 1: Calibration coefficients from the standard-curve fit
coefs     <- coef(model)
intercept <- coefs[1]
slope     <- coefs[2]

# ---- Step 2: Harmonize inputs across the two runs
# Columns shared by both joined tables (used below to align for bind_rows)
common_cols <- intersect(names(pr2_weights_joined), names(pr1_weights_joined))

# Coerce weights to numeric and parse dates regardless of input format
pr1_weights_joined <- pr1_weights_joined %>%
  mutate(
    weights = as.numeric(weights),
    date    = parse_date_time(date, orders = c("ymd", "dmy", "mdy"))
  )
pr2_weights_joined <- pr2_weights_joined %>%
  mutate(
    weights = as.numeric(weights),
    date    = parse_date_time(date, orders = c("ymd", "dmy", "mdy"))
  )

# ---- Step 3: Combine runs and convert absorbance to protein content
# NOTE(review): predictions use X600 while the curve was fitted on
# X600_cor — confirm X600 here is already blank-corrected upstream.
pr_combined <- bind_rows(
  pr2_weights_joined[common_cols],
  pr1_weights_joined[common_cols]
)

pr_combined <- pr_combined %>%
  mutate(
    con_mg_per_ml    = (X600 - intercept) / slope,
    # 0.5 is presumably the assay volume in mL — TODO confirm
    Protein_mg_total = con_mg_per_ml * 0.5,
    # weights assumed to be in mg, giving mg protein per g sample
    Protein_mg_per_g = (Protein_mg_total / weights) * 1000
  )

# ---- Step 4: Drop non-finite rows (e.g. blanks with weight 0 divide to
# Inf) and label each row with its run, inferred from the measurement date
pr_combined <- pr_combined %>%
  filter(is.finite(Protein_mg_per_g), !is.na(Protein_mg_per_g)) %>%
  mutate(run = as.factor(match(as.Date(date), unique(as.Date(date)))))

# Save combined data
write_csv(pr_combined, file.path(export_dir, "pr_combined.csv"))

# ---- Step 5: Statistical tests comparing the two runs
# NOTE(review): both formula tests require `run` to have exactly two
# levels; a third measurement date would make these calls error.
run_comparisons <- list(
  t_test_run_comparison      = t.test(Protein_mg_per_g ~ run, data = pr_combined),
  wilcox_test_run_comparison = wilcox.test(Protein_mg_per_g ~ run, data = pr_combined)
)
t_test_result <- run_comparisons$t_test_run_comparison
wilcox_result <- run_comparisons$wilcox_test_run_comparison

# Write each printed test summary next to the exported data
for (nm in names(run_comparisons)) {
  capture.output(run_comparisons[[nm]], file = file.path(export_dir, paste0(nm, ".txt")))
}

# ---- Step 6: Plot protein content by run
# Boxplot per run (outliers hidden from the box layer — each measurement
# is shown individually by the jitter layer instead).
protein_run_plot <- ggplot(pr_combined, aes(x = run, y = Protein_mg_per_g, fill = run)) +
  geom_boxplot(outlier.shape = NA, width = 0.6) +
  geom_jitter(width = 0.1, alpha = 0.5, color = "black") +
  labs(title = "Protein Content by Run", x = "Run", y = "Protein (mg/g)") +
  theme_minimal() +
  # theme() must come after theme_minimal() so these tweaks are kept
  theme(legend.position = "none", plot.title = element_text(hjust = 0.5))

# Save the figure under plots/replicate_analysis/protein via save_object()
save_object(
  object    = protein_run_plot,
  filename  = "protein_content_by_run",
  directory = "plots",
  subdir    = "replicate_analysis/protein"
)

# ---- Step 7: Outlier detection (1.5 * IQR fences)
# Fences are computed from runs 1 and 2 only.
# NOTE(review): the fences are then applied to every row of pr_combined;
# fine while only runs 1-2 exist, but revisit if more runs are added.
df_runs12 <- pr_combined %>% filter(run %in% c("1", "2"))

q1       <- quantile(df_runs12$Protein_mg_per_g, 0.25, na.rm = TRUE)
q3       <- quantile(df_runs12$Protein_mg_per_g, 0.75, na.rm = TRUE)
iqr_vals <- IQR(df_runs12$Protein_mg_per_g, na.rm = TRUE)

lower_bound <- q1 - 1.5 * iqr_vals
upper_bound <- q3 + 1.5 * iqr_vals

# Flag rows outside the fences
pr_combined <- pr_combined %>%
  mutate(
    outlier_flag = ifelse(
      Protein_mg_per_g < lower_bound | Protein_mg_per_g > upper_bound,
      "outlier",
      "normal"
    )
  )

# Save both the flagged table and an outlier-free copy
write_csv(pr_combined, file.path(export_dir, "pr_combined_with_outliers.csv"))

pr_combined_clean <- pr_combined %>% filter(outlier_flag == "normal")
write_csv(pr_combined_clean, file.path(export_dir, "pr_combined_clean.csv"))

# ---- Step 8: Replicate analysis
# analyze_replicates() summarises the per-ID replicates and writes its
# outputs (e.g. pr_rep_summary.csv, read back in Step 9) into export_dir.
# choose_best_3 = TRUE presumably keeps the 3 most consistent replicates
# per sample — confirm against the definition in process_plate_run.R.
analyze_replicates(
  data          = pr_combined_clean,
  id_col        = "ID",
  join_col      = "join_id",
  weight_col    = "weights",
  date_col      = "date",
  output_prefix = "pr_rep",
  choose_best_3 = TRUE,
  dir           = export_dir
)
## # A tibble: 41 × 37
##    ID     Protein_mg_per_g_mean Protein_mg_total_mean X600_mean
##    <fct>                  <dbl>                 <dbl>     <dbl>
##  1 Lab_01                  34.8                 0.481     0.163
##  2 Lab_02                  18.1                 0.282     0.101
##  3 Lab_03                  40.1                 0.796     0.261
##  4 Lab_04                  23.2                 0.342     0.120
##  5 Lab_05                  28.8                 0.252     0.092
##  6 Lab_06                  47.3                 1.17      0.378
##  7 Lab_07                  27.8                 0.466     0.158
##  8 Lab_08                  24.7                 0.210     0.079
##  9 Lab_09                  31.6                 0.344     0.121
## 10 Lab_10                  30.0                 0.465     0.158
## # ℹ 31 more rows
## # ℹ 33 more variables: con_mg_per_ml_mean <dbl>, Protein_mg_per_g_sd <dbl>,
## #   Protein_mg_total_sd <dbl>, X600_sd <dbl>, con_mg_per_ml_sd <dbl>,
## #   Protein_mg_per_g_se <dbl>, Protein_mg_total_se <dbl>, X600_se <dbl>,
## #   con_mg_per_ml_se <dbl>, Protein_mg_per_g_cv <dbl>,
## #   Protein_mg_total_cv <dbl>, X600_cv <dbl>, con_mg_per_ml_cv <dbl>,
## #   Protein_mg_per_g_max_dev_pct <dbl>, Protein_mg_total_max_dev_pct <dbl>, …
# ---- Step 9: Load the replicate summary written by analyze_replicates()
protein_summary <- read_csv(file.path(export_dir, "pr_rep_summary.csv"))

# Column names in the summary are derived from the variable of interest
protein_var   <- "Protein_mg_per_g"
mean_col_name <- paste0(protein_var, "_mean")
se_col_name   <- paste0(protein_var, "_se")

# Plot files go under plot_dir/replicate_analysis/
output_prefix <- file.path(
  plot_dir, "replicate_analysis",
  paste0(protein_var, "_replicate_analysis")
)
dir.create(dirname(output_prefix), recursive = TRUE, showWarnings = FALSE)

# Per-sample means with standard-error bars
protein_plot <- graph_replicates_custom_error(
  data          = protein_summary,
  id_col        = "ID",
  value_col     = mean_col_name,
  se_col        = se_col_name,
  output_prefix = output_prefix
)

# Also export an HTML copy of the plot via save_object()
save_object(
  object    = protein_plot,
  filename  = paste0(protein_var, "_replicate_plot"),
  directory = "plots",
  format    = "html",
  subdir    = "replicate_analysis/protein"
)

# ---- Step 10: Overall CV summary across samples
# A replicate set counts as "high CV" when its coefficient of variation
# exceeds 20 %.
cv_threshold <- 0.2

summary_stats <- protein_summary %>%
  summarise(
    total_rows      = n(),
    high_cv_count   = sum(Protein_mg_per_g_cv > cv_threshold, na.rm = TRUE),
    average_cv      = mean(Protein_mg_per_g_cv, na.rm = TRUE),
    high_cv_percent = mean(Protein_mg_per_g_cv > cv_threshold, na.rm = TRUE) * 100
  )

write_csv(summary_stats, file.path(export_dir, "protein_summary_stats.csv"))

# Also print to console (optional)
print(summary_stats)
## # A tibble: 1 × 4
##   total_rows high_cv_count average_cv high_cv_percent
##        <int>         <int>      <dbl>           <dbl>
## 1         41            13      0.152            31.7

Analyze protein content by sample

# 1. Define export folder for the final per-sample analysis and create it.
#    FIX: the path previously used "output prot"/"export data" (with
#    spaces), which silently created a parallel directory tree next to the
#    output_prot/export_data tree used everywhere else in this script.
final_export_dir <- file.path("output_prot", "export_data", "Samples Analysis Final")
dir.create(final_export_dir, recursive = TRUE, showWarnings = FALSE)

# 2. Merge the replicate summary with the sample metadata on "ID".
#    joindf_by_id() saves pr_rep_final.csv into final_export_dir and
#    assigns the merged data frame to `pr_rep_final` in the global
#    environment (used by the plots and group comparisons below).
joindf_by_id(
  df1           = protein_summary,
  df2           = `Sample data`,
  key_df1       = "ID",
  key_df2       = "ID",
  assign_name   = "pr_rep_final",
  save_csv_path = file.path(final_export_dir, "pr_rep_final.csv")
)
## $assigned_to
## [1] "pr_rep_final"
## 
## $rows_total
## [1] 69
## 
## $columns_total
## [1] 44
## 
## $unmatched_rows
## [1] 0
## # A tibble: 69 × 44
##    join_id Protein_mg_per_g_mean Protein_mg_total_mean X600_mean
##    <chr>                   <dbl>                 <dbl>     <dbl>
##  1 Lab_01                   34.8                 0.481     0.163
##  2 Lab_02                   18.1                 0.282     0.101
##  3 Lab_03                   40.1                 0.796     0.261
##  4 Lab_04                   23.2                 0.342     0.120
##  5 Lab_05                   28.8                 0.252     0.092
##  6 Lab_06                   47.3                 1.17      0.378
##  7 Lab_07                   27.8                 0.466     0.158
##  8 Lab_08                   24.7                 0.210     0.079
##  9 Lab_09                   31.6                 0.344     0.121
## 10 Lab_10                   30.0                 0.465     0.158
## # ℹ 59 more rows
## # ℹ 40 more variables: con_mg_per_ml_mean <dbl>, Protein_mg_per_g_sd <dbl>,
## #   Protein_mg_total_sd <dbl>, X600_sd <dbl>, con_mg_per_ml_sd <dbl>,
## #   Protein_mg_per_g_se <dbl>, Protein_mg_total_se <dbl>, X600_se <dbl>,
## #   con_mg_per_ml_se <dbl>, Protein_mg_per_g_cv <dbl>,
## #   Protein_mg_total_cv <dbl>, X600_cv <dbl>, con_mg_per_ml_cv <dbl>,
## #   Protein_mg_per_g_max_dev_pct <dbl>, Protein_mg_total_max_dev_pct <dbl>, …
# 4. Output directory for replicate-analysis plots
rep_plot_dir <- file.path("output_prot", "plots", "replicate_analysis")
dir.create(rep_plot_dir, recursive = TRUE, showWarnings = FALSE)

# 5. Histogram with error bars of mean protein content per Location
graph_replicates_custom_error(
  data          = pr_rep_final,
  id_col        = "Location",
  value_col     = "Protein_mg_per_g_mean",
  se_col        = "Protein_mg_per_g_se",
  output_prefix = file.path(rep_plot_dir, "protein_rep_analy_bylocation")
)

# Field samples only: drop freeze-dried / oven-dried / market material
pr_location <- pr_rep_final %>%
  filter(!Location %in% c(
    "Lima Market Freeze Dry", "Ilo Freeze Dry", "Ilo oven dry",
    "Ilo Fresh", "Lima Market Fresh"
  ))

# Everything except the F.Glom variety.
# NOTE(review): this filters pr_rep_final, not pr_location, so the dried
# and market locations excluded above are still present here — confirm
# that is intended.
pr_location_cham <- pr_rep_final %>%
  filter(!variety %in% c("F.Glom"))
# 6. Group comparisons; compare_groups() writes its outputs into the
#    given subfolder.

# --- By Location (F.Glom variety excluded)
compare_groups(
  data           = pr_location_cham,
  response_var   = "Protein_mg_per_g_mean",
  group_var      = "Location",
  subfolder_name = "pr_Location_cham"
)

# --- By life stage (F.Glom variety excluded)
compare_groups(
  data           = pr_location_cham,
  response_var   = "Protein_mg_per_g_mean",
  group_var      = "Life_S",
  subfolder_name = "pr_Life_S_cham"
)

# --- By variety, restricted to three sites (per the object name,
#     presumably the Paracas/Marcona sites — confirm)
pr_paracas_marcona <- pr_rep_final %>%
  filter(Location %in% c("Mendieta", "7H", "Caro Caido"))

compare_groups(
  data           = pr_paracas_marcona,
  response_var   = "Protein_mg_per_g_mean",
  group_var      = "variety",
  subfolder_name = "pr_variety"
)

# --- By Location among gametophyte/tetrasporophyte samples
#     (field sites only, via pr_location)
pr_gamtetra <- pr_location %>%
  filter(Life_S %in% c("Gam/Tetra", "Gam", "Tetra"))

compare_groups(
  data           = pr_gamtetra,
  response_var   = "Protein_mg_per_g_mean",
  group_var      = "Location",
  subfolder_name = "pr_gamtetra_location"
)